#! /usr/bin/python3
# -*- coding:utf-8 -*- 
# Author: vicnic 
# Date: 2018-09-09 20:35:06 

import random
import requests
import hashlib
import os
import re
from lxml import html
import Settings as st

# Module-level record used for URL de-duplication: urlIsInMD5() appends the
# MD5 hex digest of each new URL, and getHtmlTrees() also appends the raw URL.
HaveLoadList = []
# The User-Agent (request headers) currently in use.
UsingHeaders = ''

def getHeads():
    """Build HTTP request headers with a randomly chosen User-Agent.

    One entry of ``st.user_agent_list`` is picked at random. The resulting
    header dict is also stored in the module-level ``UsingHeaders`` so the
    headers of the most recent call can be inspected.

    Returns:
        dict: headers with the keys ``User-Agent``, ``Accept`` and
        ``Connection``.

    Raises:
        ValueError: if ``st.user_agent_list`` is empty.
    """
    # Without this declaration the assignment below would only create a local
    # variable and the module-level UsingHeaders would never change.
    global UsingHeaders

    agentList = st.user_agent_list
    if not agentList:
        raise ValueError('st.user_agent_list is empty; cannot choose a User-Agent')

    myHeaders = {
        # Usually not needed; uncomment if the target site requires it.
        # 'Host' : st.user_host,
        'User-Agent': random.choice(agentList),
        # 'cookie':st.user_cookie,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Connection': 'keep-alive',
    }
    UsingHeaders = myHeaders
    return myHeaders
def getHtmlTrees(pageUrl, timeout=None):
    """Fetch a page and return it parsed as an lxml HTML tree for XPath use.

    A URL that urlIsInMD5() reports as already seen is skipped. Otherwise the
    raw URL is appended to ``HaveLoadList`` (in addition to the digest that
    urlIsInMD5() records) and the page is requested with headers from
    getHeads().

    Args:
        pageUrl: URL of the page to download.
        timeout: optional timeout passed straight to ``requests.get``. The
            default ``None`` keeps the previous behaviour of waiting
            indefinitely.

    Returns:
        The tree produced by ``html.fromstring``, or ``None`` when the URL
        was already seen or the server answered 404.
    """
    if urlIsInMD5(pageUrl):
        return None

    HaveLoadList.append(pageUrl)
    response = requests.get(pageUrl, headers=getHeads(), timeout=timeout)

    # Check the status first so a 404 body is never decoded or parsed.
    if response.status_code == 404:
        print('TinySpider:' + pageUrl + ' is 404')
        return None

    # Decode the body using the encoding detected from the content itself.
    response.encoding = response.apparent_encoding
    return html.fromstring(response.text)
# URL de-duplication
def urlIsInMD5(str):
    """Tell whether a URL was seen before, recording it when it was not.

    The UTF-8 encoded URL is hashed with MD5 and the hex digest is looked up
    in the module-level ``HaveLoadList``.

    Args:
        str: the URL to check (the name shadows the builtin; it is kept
            because it is part of the function's signature).

    Returns:
        bool: ``True`` if the digest was already in ``HaveLoadList``;
        ``False`` if it was new, in which case it has now been appended.
    """
    digest = hashlib.md5(str.encode("utf8")).hexdigest()
    if digest in HaveLoadList:
        return True
    HaveLoadList.append(digest)
    return False

def downLoadFile(picUrl, timeout=None):
    """Download a file into the directory ``st.resorce_save_path``.

    The directory is created if it is missing. The file is named after the
    MD5 hex digest of the URL, followed by ``.<ext>`` when judgeFileType()
    recognises an extension. Nothing is written unless the response status
    is 200.

    Args:
        picUrl: URL of the file to download.
        timeout: optional timeout passed straight to ``requests.get``. The
            default ``None`` keeps the previous behaviour of waiting
            indefinitely.
    """
    save_dir = st.resorce_save_path
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)

    file_response = requests.get(picUrl, timeout=timeout)
    if file_response.status_code != 200:
        return

    file_name = hashlib.md5(picUrl.encode("utf8")).hexdigest()
    type_str = judgeFileType(picUrl)
    if type_str is not None:
        file_name = file_name + '.' + type_str

    # os.path.join keeps the file inside save_dir even when the configured
    # path has no trailing separator; the with-block closes the file handle.
    with open(os.path.join(save_dir, file_name), 'wb') as out_file:
        out_file.write(file_response.content)


# Return the file extension to use for a downloaded file.
def judgeFileType(file_url):
    """Return the known file extension found in a URL, or None.

    Image extensions (jpg, gif, bmp, png) are looked for first, then media
    extensions (mp3, mp4, avi, rmvb, flv, wmv, mov). Matching is
    case-insensitive and the text is returned exactly as it appears in the
    URL (e.g. ``'PNG'``).

    An extension only counts when it is not embedded in a longer
    alphanumeric run, so ``.../jpgallery/clip.mp4`` yields ``'mp4'`` and
    ``.../movie`` yields ``None`` instead of ``'jpg'`` / ``'mov'``.

    Args:
        file_url: the URL (or file name) to inspect.

    Returns:
        str or None: the matched extension without the dot, or ``None`` when
        no known extension is present.
    """
    regex_img_str = 'jpg|gif|bmp|png'
    regex_video_str = 'mp3|mp4|avi|rmvb|flv|wmv|mov'
    # Images keep priority over media, as in the original lookup order.
    for alternatives in (regex_img_str, regex_video_str):
        # The look-arounds reject matches glued to other letters or digits.
        pattern = r'(?<![0-9a-z])(?:' + alternatives + r')(?![0-9a-z])'
        match_obj = re.search(pattern, file_url, re.I)
        if match_obj:
            return match_obj.group()
    return None

# Search the given text with the given regular expression and return the hit.
def getContenByRE(content,re_str):
    """Return the first match of ``re_str`` in ``content``.

    The search is case-insensitive and multi-line (``re.I`` and ``re.M``).

    Args:
        content: the text to search.
        re_str: the regular expression pattern.

    Returns:
        str: the whole matched text, or ``''`` when nothing matches.
    """
    found = re.search(re_str, content, flags=re.I | re.M)
    return found.group() if found is not None else ''
# def getMySQLCursor():
#     connect = pymysql.connect(
#         host = st.mysql_host, 
#         user = st.mysql_user, 
#         passwd = st.mysql_password,  
#         db = st.mysql_database,  
#         port = st.mysql_port,  
#         charset = st.mysql_charset
#     )
#     return connect.cursor()